---
title: "Project"
author: ""
date: "2023-12-11"
output:
  flexdashboard::flex_dashboard:
    orientation: rows
    social: ["wechat"]
    theme: united
    source_code: embed
---
```{r setup, include=FALSE}
library(ggplot2)
library(plotly)
library(dplyr)
library(stringr)
library(tidyr)
library(flexdashboard)
library(knitr)
library(tidyverse)
library(purrr) # for functional programming
library(wordcloud2) # for creating word cloud
```
```{r}
# Load the titles table from disk; the chunks below read their input from `df`.
df <- read.csv(file = "data/netflix_titles.csv")
```
# List_in {.storyboard}
### Total prevalence type & Total production quantity
```{r}
# Build the genre-level table: one row per (title, genre) pair, ordered by year.
data <- df %>%
  # Keep the last four characters of release_year and store them as a number.
  mutate(release_year = as.numeric(str_sub(release_year, start = -4))) %>%
  # Split the comma-separated `listed_in` field into one row per entry.
  separate_rows(listed_in, sep = ", ") %>%
  arrange(release_year)
```
```{r}
# For every release year, keep the ten genres with the most titles.
# Ties on the last place are all kept, so a year can contribute more than ten
# rows. The ranking column is named explicitly instead of relying on
# `top_n()` picking the last column.
top10_types <- data %>%
  group_by(release_year, listed_in) %>%
  summarise(type_count = n(), .groups = "drop") %>%
  group_by(release_year) %>%
  slice_max(order_by = type_count, n = 10, with_ties = TRUE) %>%
  # Consumers regroup by genre, so hand back an ungrouped table.
  ungroup() %>%
  arrange(release_year, desc(type_count))
```
```{r}
# Count, per genre, the number of rows it has in the yearly top-ten table,
# ordered from the most frequent genre to the least frequent one.
grouped_table <- top10_types %>%
  ungroup() %>%
  count(listed_in, name = "count", sort = TRUE)
```
```{r}
# Interactive bar chart: number of top-ten years per genre, tallest bar first.
grouped_table <- setNames(grouped_table, c("Type_name", "epidemic_years"))
p1 <- grouped_table %>%
  ggplot(aes(
    x = reorder(Type_name, -epidemic_years),
    y = epidemic_years,
    fill = epidemic_years
  )) +
  geom_col() +
  # Dark-to-red gradient: the larger the value, the redder the bar.
  scale_fill_gradient(low = "#221f1f", high = "#b20710") +
  labs(
    title = "Total prevalence type",
    x = "type",
    y = "The number of epidemic years of a type"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplotly(p1)
```
```{r}
# Sum the yearly top-ten counts per genre, then rank genres by that total
# from largest to smallest.
total_table <- top10_types %>%
  group_by(listed_in) %>%
  summarise(total_count = sum(type_count), .groups = "drop") %>%
  arrange(desc(total_count))
```
### Total prevalence type & Total production quantity
```{r}
# Interactive bar chart: summed title counts per genre, tallest bar first.
total_table <- setNames(total_table, c("Type_name", "Production_volume"))
p2 <- total_table %>%
  ggplot(aes(
    x = reorder(Type_name, -Production_volume),
    y = Production_volume,
    fill = Production_volume
  )) +
  geom_col() +
  # Dark-to-red gradient: the larger the value, the redder the bar.
  scale_fill_gradient(low = "#221f1f", high = "#b20710") +
  labs(
    title = "Total production quantity",
    x = "type",
    y = "Production volume"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplotly(p2)
```
Duration
============================
Row
-------------------
### Movie Duration vs. Decade
```{r}
# Box plots of movie running time (minutes) per release decade.
data <- subset(df, type == "Movie")
# Keep the first run of digits in `duration` as the numeric running time.
data$duration <- as.numeric(str_extract(data$duration, "\\d+"))
# right = FALSE makes each bin [start, start + 10), so a boundary year such as
# 2020 is labelled 2020 instead of falling into the previous decade.
data$decade <- cut(
  data$release_year,
  breaks = seq(1920, 2040, by = 10),
  labels = seq(1920, 2030, by = 10),
  right = FALSE
)
# Titles longer than 200 minutes are left out of the plot.
ggplot(data %>% filter(duration <= 200), aes(x = decade, y = duration)) +
  geom_boxplot(fill = "#b20710", color = "#221f1f", notch = TRUE) +
  labs(
    title = "Duration Distribution of Movies by Decade",
    x = "Decade",
    y = "Duration"
  ) +
  theme_minimal() +
  scale_y_continuous(
    breaks = seq(0, 200, 50),
    labels = paste0(seq(0, 200, 50), " minutes")
  )
```
### TV Show Duration vs. Decade
```{r}
# Box plots of TV show length (number of seasons) per release decade.
data <- subset(df, type == "TV Show")
# Keep the first run of digits in `duration` as the numeric season count.
data$duration <- as.numeric(str_extract(data$duration, "\\d+"))
# right = FALSE makes each bin [start, start + 10), so a boundary year such as
# 2020 is labelled 2020 instead of falling into the previous decade.
data$decade <- cut(
  data$release_year,
  breaks = seq(1920, 2040, by = 10),
  labels = seq(1920, 2030, by = 10),
  right = FALSE
)
ggplot(data, aes(x = decade, y = duration)) +
  geom_boxplot(fill = "#b20710", color = "#221f1f", notch = TRUE) +
  labs(
    title = "Duration Distribution of TV Show by Decade",
    x = "Decade",
    y = "Duration"
  ) +
  theme_minimal() +
  scale_y_continuous(
    breaks = seq(0, 20, 5),
    labels = paste0(seq(0, 20, 5), " seasons")
  )
```
Row
-------------------
### Movie Duration vs. Genre
```{r}
# Mean movie running time (minutes) for each genre tag in `listed_in`.
data <- df %>%
  filter(type == "Movie") %>%
  # One row per (title, genre) pair; the duration text is repeated on each row.
  separate_rows(listed_in, sep = ", ") %>%
  group_by(listed_in) %>%
  summarise(
    avg_duration = mean(as.numeric(str_extract(duration, "\\d+")), na.rm = TRUE)
  )
# Horizontal bars ordered by average duration, shaded dark-to-red.
data %>%
  ggplot(aes(
    x = reorder(listed_in, avg_duration),
    y = avg_duration,
    fill = avg_duration
  )) +
  geom_col() +
  scale_y_continuous(breaks = seq(0, 150, 20), labels = seq(0, 150, 20)) +
  scale_fill_gradient(low = "#221f1f", high = "#b20710") +
  coord_flip() +
  labs(
    title = "Average Duration of Movies by Genre",
    x = "Genre",
    y = "Average Duration (minutes)",
    fill = "Average Duration (minutes)"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```
### TV Show Duration vs. Genre
```{r}
# Mean TV show length (number of seasons) for each genre tag in `listed_in`.
data <- df %>%
  filter(type == "TV Show") %>%
  mutate(duration = as.numeric(str_extract(duration, "\\d+"))) %>%
  separate_rows(listed_in, sep = ", ") %>%
  group_by(listed_in) %>%
  summarise(avg_duration = mean(duration, na.rm = TRUE))
# Draw the chart: horizontal bars ordered by average duration.
ggplot(data, aes(x = reorder(listed_in, avg_duration), y = avg_duration, fill = avg_duration)) +
  geom_bar(stat = "identity") +
  labs(
    title = "Average Duration of TV Shows by Genre",
    x = "Genre",
    y = "Average Duration (seasons)",
    fill = "Average Duration (seasons)"
  ) +
  theme_minimal() +
  scale_y_continuous(breaks = seq(0, 7, 1), labels = seq(0, 7, 1)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  coord_flip() +
  # Same direction as the other gradients in this dashboard: dark for low
  # values, red for high values.
  scale_fill_gradient(low = "#221f1f", high = "#b20710")
```
Rating
=====================================
## Column {.tabset data-width="650"}
```{r include=FALSE}
# Drop titles whose rating is an empty string; the rating charts use `mydata`.
mydata <- df[df[["rating"]] != "", ]
# Distinct rating labels, evaluated only for inspection.
l <- unique(mydata[["rating"]])
l
# Number of titles per rating.
plotdata <- summarize(group_by(mydata, rating), n = n())
plotdata
```
### Rating categories by type
```{r}
# Stacked bars of title counts per rating, filled by `type` and faceted by the
# audience category that each rating is mapped to.
plotdata <- mydata %>%
  group_by(type, rating) %>%
  summarize(n = n(), .groups = "drop")
category_type_data <- plotdata %>%
  mutate(
    category = case_when(
      rating %in% c("G", "TV-G", "TV-Y") ~ "Little Kids",
      rating %in% c("PG", "TV-PG", "TV-Y7") ~ "Older Kids",
      rating %in% c("TV-Y7-FV", "PG-13", "TV-14") ~ "Teenagers",
      rating %in% c("NC-17", "NR", "R", "TV-MA", "UR") ~ "Adults",
      TRUE ~ "Other" # Fallback for any rating not listed above
    ),
    # "Other" must be a level as well; without it the fallback rows turn into
    # NA and are drawn under an "NA" facet.
    category = factor(
      category,
      levels = c("Little Kids", "Older Kids", "Teenagers", "Adults", "Other"),
      ordered = TRUE
    )
  )
p <- category_type_data %>%
  ggplot(aes(rating, n, fill = type, label = n)) +
  geom_bar(stat = "identity", position = "stack") +
  # Print each segment's count in the middle of the segment.
  geom_text(position = position_stack(vjust = 0.5), color = "white", size = 3) +
  labs(x = "Rating", y = "Count", title = "Distribution of Ratings with Type") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    # Black frame around every facet panel.
    # NOTE(review): ggplot2 >= 3.4 prefers `linewidth` over `size` here; kept
    # as `size` so older ggplot2 versions still accept it.
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    strip.background = element_blank(),
    strip.text = element_text(face = "bold"),
    strip.text.x = element_text(margin = margin(b = 10)),
    legend.position = "right"
  ) +
  # One facet per audience category; each facet shows only its own ratings.
  facet_grid(. ~ category, scales = "free_x", labeller = label_both) +
  scale_fill_manual(values = c("#b20710", "#221f1f"))
p
```
### Rating categories by release year
```{r}
# Share of each audience category among titles, per release year from 2000 on.
plotdata <- summarize(group_by(mydata, rating, release_year), n = n())
p3 <- plotdata %>%
  # Map every rating to an audience category.
  mutate(category = case_when(
    rating %in% c("G", "TV-G", "TV-Y") ~ "Little Kids",
    rating %in% c("PG", "TV-PG", "TV-Y7") ~ "Older Kids",
    rating %in% c("TV-Y7-FV", "PG-13", "TV-14") ~ "Teenagers",
    rating %in% c("NC-17", "NR", "R", "TV-MA", "UR") ~ "Adults",
    TRUE ~ "Other" # Fallback for any rating not listed above
  )) %>%
  filter(release_year >= 2000) %>%
  ggplot(aes(x = release_year, y = n, fill = category)) +
  # position = "fill" rescales every year's stack to 100 %.
  geom_col(position = "fill") +
  scale_y_continuous(labels = scales::percent_format(scale = 100)) +
  scale_fill_manual(values = c("#221f1f", "#5e4d4d", "#b20710", "#ec8b90", "#ec8b06")) +
  labs(
    x = "Release Year",
    y = "Count (Percentage)",
    title = "Distribution of Ratings Over Years"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
p3_plotly <- ggplotly(p3)
p3_plotly
```
### The distribution of rating
```{r}
# Interactive bar chart of the number of titles per rating, tallest bar first.
plotdata <- summarize(group_by(mydata, rating), n = n())
p1 <- plotdata %>%
  ggplot(aes(x = reorder(rating, -n), y = n, fill = rating)) +
  geom_col() +
  # One manual colour per rating value.
  scale_fill_manual(values = c(
    "#221f1f", "#301D1C", "#3E1B1B", "#4C1919", "#5A1717",
    "#681515", "#761313", "#841111", "#921010", "#A00E0E",
    "#AE0C0C", "#BC0A0A", "#CA0808", "#B20710", "#b20710", "#ec8b06", "#ec8b90"
  )) +
  labs(x = "Rating", y = "Count", title = "The distribution of rating") +
  # Rotate the x labels so that they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
p1_plotly <- ggplotly(p1)
p1_plotly
```
Country
=====================================
## Column {.tabset data-width="650"}
### Country vs Types
```{r}
# Share of movies versus TV shows among each country's titles.

# Re-read the table so that empty strings are treated as missing values.
df <- read.csv("data/netflix_titles.csv", na.strings = c("", "NA"))

# Titles per (country, year added, type). A title that lists several countries
# is counted once for each of them.
grouped <- df %>%
  filter(!is.na(country) & country != "") %>% # remove the NA country
  filter(!is.na(date_added)) %>% # remove the NA date_added
  mutate(
    # Trim first in case the field carries surrounding whitespace, which would
    # otherwise make the parse fail. `year()` is namespace-qualified so it
    # resolves even when lubridate is not attached.
    year_added = lubridate::year(parse_date(trimws(date_added), "%B %d, %Y")),
    # Split the comma-separated country list and strip the padding.
    country = lapply(strsplit(as.character(country), ","), trimws)
  ) %>%
  unnest(country) %>%
  # Splitting can leave empty entries; drop them before counting.
  filter(!is.na(country) & country != "") %>%
  group_by(country, year_added, type) %>%
  summarise(cnt = n(), .groups = "drop")

# Collapse the years: titles per (country, type).
by_country_type <- grouped %>%
  group_by(country, type) %>%
  summarise(cnt = sum(cnt), .groups = "drop")

# Per country: total number of titles and the percentage held by each type.
type_prop <- by_country_type %>%
  group_by(country) %>%
  mutate(
    total = sum(cnt),
    prop = round(cnt / total * 100, 1)
  ) %>%
  ungroup() %>%
  select(country, total, prop, type) %>%
  as.data.frame() %>%
  # NOTE(review): this ranks rows, not countries. With up to one row per type
  # for each country, it keeps roughly the ten largest countries; confirm that
  # this is the intended cut-off.
  top_n(20, wt = total)

# Axis order: movie rows first by descending share, then TV show rows by
# ascending share; each country is placed at its first appearance.
custom_order <- type_prop %>%
  arrange(desc(ifelse(type == "Movie", prop, -prop))) %>%
  pull(country) %>%
  unique()

ggplot(type_prop, aes(y = factor(country, levels = custom_order), x = prop, fill = type)) +
  geom_bar(stat = "identity") +
  # Print each segment's percentage in the middle of the segment.
  geom_text(
    aes(label = scales::percent(prop / 100)),
    position = position_stack(vjust = 0.5),
    color = "white",
    size = 3
  ) +
  labs(
    title = "Proportions of Movie and TV Show by Country",
    y = "Country",
    x = "Proportion (%)",
    fill = "Type"
  ) +
  # `prop` is already on a 0-100 scale, hence scale = 1.
  scale_x_continuous(labels = scales::percent_format(scale = 1), limits = c(0, 100)) +
  scale_fill_manual(values = c("#221f1f", "#b20710")) +
  theme_minimal()
```
# Keywords {.storyboard}
```{r}
library(tidyverse)
library(purrr) # for functional programming
library(wordcloud2) # for creating word cloud
# Reload the titles table, reading empty strings and "NA" as missing values.
df <- read.csv(file = "data/netflix_titles.csv", na.strings = c("", "NA"))
# Ten colours interpolated across the three anchor colours; used by the word
# clouds.
netflix_color <- colorRampPalette(colors = c("#221f1f", "#b20710", "#e50914"))(n = 10)
```
```{r}
# Lower-case words that extract_words() removes from its output.
COMMON_WORDS <- c(
  "a", "in", "at", "be", "of", "the", "an", "to", "on", "he", "she", "and",
  "his", "with", "her", "for", "their", "when", "this", "from", "as", "is",
  "by", "after", "that", "who", "but", "into", "up", "they", "him", "out",
  "must", "are", "about", "it", "its", "while", "one", "them", "where", "has",
  "more", "over", "have", "off", "two", "s"
)
# Split text into lower-case words made of ASCII letters only.
#
# sentences: character vector of free text; NA entries are skipped.
# Returns a character vector of words in input order, with every character
# other than a-z / A-Z removed, and with empty strings and any word found in
# COMMON_WORDS dropped.
extract_words <- function(sentences) {
  # Whole-vector calls replace the per-word map()/flatten_chr() pipeline.
  tokens <- unlist(strsplit(as.character(sentences), "\\s+"), use.names = FALSE)
  # perl = TRUE so that `\\s` inside the bracket expression means whitespace.
  tokens <- tolower(gsub("[^a-zA-Z\\s]", "", tokens, perl = TRUE))
  keep <- !is.na(tokens) & nzchar(tokens) & !(tokens %in% COMMON_WORDS)
  tokens[keep]
}
# Draw a word cloud of the most frequent words in `data$description`.
#
# data: data frame with a `description` text column.
# max_words: maximum number of distinct words to draw (default 500).
# Returns the widget produced by wordcloud2(), coloured with `netflix_color`.
word_cloud <- function(data, max_words = 500) {
  words <- extract_words(data$description)
  occur <- sort(table(words), decreasing = TRUE)
  # Cap at the number of words actually present; indexing past the end of the
  # table would pad the result with NA entries.
  n_keep <- min(max_words, length(occur))
  wordcloud2(occur[seq_len(n_keep)], color = netflix_color)
}
```
### Description keywords, 2000-2009
```{r}
# Word cloud for titles released from 2000 up to, but not including, 2010.
word_cloud(filter(df, release_year >= 2000, release_year < 2010))
```
### Description keywords, 2010-2019
```{r}
# Word cloud for titles released from 2010 up to, but not including, 2020.
word_cloud(filter(df, release_year >= 2010, release_year < 2020))
```